### Replication code for the manuscript
### Model-based Clustering and Typologies in the Social Sciences
### John S. Ahlquist and Christian Breunig
### Version: 30.September.2011
##################################################################
### The replication uses R 2.13.1 with  packages listed below
### and Matlab with the mixmod package (http://www.mixmod.org)
### This R file contains code that generates some files that are
### required for analysis in Matlab.  We use the R.matlab package
### which opens Matlab directly from R and allows us to set
### matlab parameter values using stored R objects. In this    
### replication archive we insert the matlab code directly.    
### All matlab code is preceded with a #% whereas comments  
### for expression in R are preceded by # or ###.
### See the README file for further detail.
##################################################################


### load the required packages and source some additional functions
library(MASS)
library(mclust)
library(R.matlab)
library(clustvarsel)
source("surfacePlotG.R") 

#setwd("foo")
#we recommend using the same location where you unzipped Codes.zip
#for ease of R-Matlab switching  

### Simulated data and model selection
## Figure 1 
## comparing MBC-BIC with MBC-BIC-ICL
# generate simulated data
set.seed(5.6)
sig<-matrix(c(1,0,0,1),ncol=2)
sig1<-matrix(c(1,.8,.8,1),ncol=2)
sig2<-matrix(c(1,-.8,-.8,1),ncol=2)
x<-mvrnorm(n=500,mu=c(-5,5),Sigma=sig)
x1<-mvrnorm(n=500,mu=c(1,1),Sigma=sig1)
x2<-mvrnorm(n=500,mu=c(1,1),Sigma=sig2)
X<-rbind(x,x1,x2)
# perform MBC
sim.mbc<-Mclust(X,G=1:8,warn=T, prior=priorControl()) 
sim.mbc
# 
# write.table(X, file = "X.csv", na = "NA", row.names = F,col.names = F, sep = ",")
# plot Figure 1 (left and middle plot; the third plot is from Matlab)
pdf("Figure1.pdf", height = 5, width = 9.5, onefile = T,paper = "special") 
par(mfrow=c(1,2))
plot(x,xlim=c(-10,5),ylim=c(-5,10),ylab="",xlab="",pch=19)
title("Simulated Data")
points(x1,col= grey(.6),pch=17)
points(x2, col = grey(.3),pch=23)
mclust2Dplot(X,parameters=sim.mbc$parameters,z=sim.mbc$z, what="uncertainty",xlim=c(-10,5),ylim=c(-5,10),cex=.7)
title(main = "MBC-BIC with uncertainty")
dev.off()

# comparing MBC-BIC with MBC-BIC-ICL
# Connect with MATLAB
Matlab$startServer()
matlab <- Matlab()
isOpen <- open(matlab)
Sys.sleep(15) #allow time for R to open connection with Matlab
              #R won't automatically wait for a connection before executing the rest of the code 
print(matlab) #check that Matlab connection is open
setVariable(matlab, exp.data.obs = X)
setVariable(matlab, exp.res.BIC.m = t(sim.mbc$parameters$mean))
setVariable(matlab, exp.res.BIC.S = sim.mbc$parameters$variance$sigma)
setVariable(matlab, exp.res.BIC.p = t(sim.mbc$parameters$pro))
setVariable(matlab, exp.res.BIC.K = t(sim.mbc$G))
close(matlab)

# MATLAB code
#% cd 'C:\Codes' %Directory where Codes was placed.
#% MixCombi
#% PlotResults
#% PlotEntropy

# remove objects
rm("sig","sig1","sig2","sim.mbc","x","X","x1","x2","matlab")

### Varities of Capitalism replication 
## basic classification set up
countries<-c("AUS", "CAN","GBR","JPN","CHE","USA","AUT","BEL","DEU","FRA","ITA","DNK","FIN","NLD","NOR","SWE",
 "GRC", "IRL", "NZL","PRT","ESP") # from table 1.
VoC.class<-c("LME","LME","LME","CME","CME","LME","CME","CME","CME","NC","NC","CME","CME","CME","CME","CME","NC",
  "LME","LME","NC", "NC")
#assing colors: LME=black, CME=red, NC = green
VoC.col<-c(1,1,1,2,2,1,2,2,2,3,3,2,2,2,2,2,3,1,1,3,3) 

## Iversen et al replication
# read data and organize it
iver <- read.csv("iversen.csv")
iver$iver.voc<-VoC.class[match(iver$wbcode,countries)]
iver$iver.col<-VoC.col[match(iver$wbcode,countries)]
rownames(iver)<-iver$wbcode
attach(iver)
dat<-cbind(epl,cdp,cop,nurr,gb,dsj)   
rownames(dat)<-iver$wbcode
# perfrom principle component analysis for 
# the two substantive subsets (UP and EP)
emp.pca<-prcomp(cbind(epl,cdp, cop),retx=T,scale=T)
unm.pca<-prcomp(cbind(nurr, gb,dsj),retx=T,scale=T)
# take first component of each dim in order to match Iversen et al.
pca.dim<-cbind(emp.pca$x[,1],unm.pca$x[,1])
colnames(pca.dim)<-c("Employment Protection PC1","Unemployment Protection PC1")

# write principle component data for MATLAB use 
# for footnote 16 - perform MBC in MATLAB with the ICL criterion
write.table(pca.dim, file = "pca-dim-iv.txt", na = "NA", row.names = F,col.names = F)

#Matlab code to be executed at will; no need to set up R-Matlab link
#% ivpca = importdata('pca-dim-iv.txt');
#% nbCluster = 2:4;
#% crit = 'ICL';
#% ivout1 = mixmod(ivpca,nbCluster,'criterion',crit);
#% ivout1.modelOutput(1)
#% ivout1.modelOutput(1).nbCluster
#% ivout1.modelOutput(1).proba.partition


# perform MBC
iver.pca.clust.np<-Mclust(pca.dim,G=1:8,warn=T)  
# => singluar covariance
iver.pca.clust<-Mclust(pca.dim,G=1:8,warn=T, prior=priorControl()) 
# country classifcation probabilities
# without priors
data.frame(cbind(rownames(iver),round(iver.pca.clust.np$z,digits=5))) 
# with priors
data.frame(cbind(rownames(iver),round(iver.pca.clust$z,digits=5)))    

# Figure 3 - MBC on first two dimensions without priors
pdf("Figure3.pdf", height = 5, width = 14, onefile = T,paper = "special") 
par(mfrow=c(1,3),mar=c(4.6, 4.1, 3.6, 0.6))
mclust2Dplot(pca.dim,parameters=iver.pca.clust.np$parameters, z=iver.pca.clust.np$z,
             what="classification", identify=F, truth=iver$iver.voc, colors="white")
text(pca.dim[,1], pca.dim[,2],rownames(dat),col=iver$iver.col,font=2)
title(main="Classification without prior")
surfacePlotG(pca.dim, ,parameters=iver.pca.clust.np$parameters, type = "contour",
             what = "density", transformation = "none", nlevels=4)
text(pca.dim[,1], pca.dim[,2],rownames(dat),col=iver$iver.col,font=2)
title(main="Classification without prior")
plot.mclustBIC(iver.pca.clust.np$BIC,legendArgs = list(x="bottomleft", ncol=4,cex=.75), ylim=c(-170,-115))
title(main="BIC without prior")
abline(h=iver.pca.clust.np$bic-10, lty=2, lwd=2)
dev.off()

#Figure 4 -  - MBC on first two dimensions with priors
pdf("Figure4.pdf", height = 5, width = 14, onefile = T,paper = "special") 
par(mfrow=c(1,3),mar=c(4.6, 4.1, 3.6, 0.6))
mclust2Dplot(pca.dim,parameters=iver.pca.clust$parameters, z=iver.pca.clust$z,
             what="classification", identify=F, truth=iver$iver.voc, colors="white")
text(pca.dim[,1], pca.dim[,2],rownames(dat),col=iver$iver.col,font=2)
title(main="Classification with prior")
par(mar=c(4.6, 4.1, 3.6, 0.6))
surfacePlotG(pca.dim, ,parameters=iver.pca.clust$parameters, type = "contour",
             what = "density", transformation = "none", nlevels=4)
text(pca.dim[,1], pca.dim[,2],rownames(dat),col=iver$iver.col,font=2)
title(main="Classification with prior")
par(mar=c(4.6, 4.1, 3.6, 0.6))
plot.mclustBIC(iver.pca.clust$BIC,legendArgs = list(x="bottomleft",ncol=4,cex=.75),
               main="With prior")
title(main="BIC with prior")
abline(h=iver.pca.clust$bic-10, lty=2, lwd=2)
dev.off()

# for discussion in manuscript
# check 3- and 4- cluster solutions because BIC not different enough
iver.EEI3<-Mclust(pca.dim,G=3,modelNames="EEI",warn=T, prior=priorControl())
iver.EEI4<-Mclust(pca.dim,G=4,modelNames="EEI",warn=T, prior=priorControl())
mclust2Dplot(pca.dim,parameters=iver.EEI3$parameters, z=iver.EEI3$z,what="classification", identify=F, truth=iver$iver.voc)
mclust2Dplot(pca.dim,parameters=iver.EEI4$parameters, z=iver.EEI4$z,what="classification", identify=F, truth=iver$iver.voc)
# => LMEs still together
#Using BIC-ICL-Entropy procedure
#connecting to Matlab
Matlab$startServer()
matlab<-Matlab()
isOpen <- open(matlab)
print(matlab) #check that Matlab connection is open
setVariable(matlab, exp.data.obs = pca.dim)
setVariable(matlab, exp.res.BIC.m = t(iver.EEI4$parameters$mean))
setVariable(matlab, exp.res.BIC.S = iver.EEI4$parameters$variance$sigma)
setVariable(matlab, exp.res.BIC.p = t(iver.EEI4$parameters$pro))
setVariable(matlab, exp.res.BIC.K = t(iver.EEI4$G))
close(matlab)

# MATLAB code
#% cd 'C:\Codes' %Directory where Codes was placed.
#% MixCombi
#% PlotResults
#% PlotEntropy


# for discussion in manuscript:
# use weaker assumptions about what variables provide clustering information
# checking the clustering over each set of principle components using variable selection
pca.iver<-cbind(emp.pca$x,unm.pca$x)
colnames(pca.iver)<-c("PC1 emp", "PC2 emp", "PC3 emp", "PC1 unem", "PC2 unem", "PC3 unem")
pc.select<-clustvarsel(pca.iver, G=8)
# perform MBC
# with prior => 1 cluster
iver.pca.sel.clust1<-Mclust(pc.select$sel.var, G=1:8, prior=priorControl(), warn=T) 
# without prio r=> 5 clusters
iver.pca.sel.clust2<-Mclust(pc.select$sel.var, G=1:8, warn=T) 
# for discussion in manuscript:
# compute adjusted rand index between new soluation and original VOC clusters
adjustedRandIndex(iver.pca.sel.clust2$classification,iver$iver.col)
# redo the procedure using principle components on all data
all.pca<-prcomp(dat,retx=T,scale=T)
pc.select.all<-clustvarsel(all.pca$x,G=8)

# write chosen principle components for mixmod use
# use this file & mixmod to verify footnote 18
write.table(pc.select.all$sel.var, file = "pc-select-all.txt", na = "NA", row.names = F,col.names = F)
# perform MBC with and without prior
# prior => 1 cluster
iver.pca.all.clust1<-Mclust(pc.select.all$sel.var, G=1:8, warn=T,prior=priorControl()) 
# no prior => 5 clusters
iver.pca.all.clust2<-Mclust(pc.select.all$sel.var, G=1:8, warn=T)  
# country classifcation probabilities for the five cluster solution
iver.pca.all.clust2$z

## Hall and Gingerich replication
# read data and organize it
hg.rep.data<-read.csv("H&G-Rep.csv")
# noise on JPN, CHE a la H-G
noise.hg<-c(FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,
            FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE) 
# noise for "Mediterrenean cluster"
noise.latin<-c(FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,
               FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) 
rownames(hg.rep.data)<-hg.rep.data$wbcode
hg.voc<-VoC.class[match(hg.rep.data$wbcode,countries)]
hg.col<-VoC.col[match(hg.rep.data$wbcode,countries)]
hg.rep.data<-cbind(hg.rep.data, noise.hg, noise.latin,hg.voc,hg.col)
hg.rep.nomiss<-na.omit(hg.rep.data) # remove observations with missing values

# perform principle component analysis
hg.cap.pr<-prcomp(hg.rep.nomiss[,4:6], scale=T)
hg.lab.pr<-prcomp(hg.rep.nomiss[,7:9], scale=T)
pca.dim<-cbind(hg.cap.pr$x[,1],hg.lab.pr$x[,1])
colnames(pca.dim)<-c("Corp. gov. PC1", "Lab. Rel. PC1")
# write first principle components from each dimension for mixmod use
# this file will be used for footnote 21
write.table(pca.dim, file = "pca-dim-hg.txt", na = "NA", row.names = F,col.names = F)

#Matlab code to be executed at will; no need to set up R-Matlab link
#% hgpca = importdata('pca-dim-hg.txt');
#% nbCluster = 2:6;
#% crit = 'ICL';
#% hgout1 = mixmod(hgpca,nbCluster,'criterion',crit);
#% hgout1.modelOutput(1)
#% hgout1.modelOutput(1).nbCluster
#% hgout1.modelOutput(1).proba.partition


# perform MBC with prior
hg.pr.first.lab.cap<-Mclust(pca.dim, G=1:9, warn=T, prior=priorControl()) 
# display results for discussion in manuscript
hg.pr.first.lab.cap
round(hg.pr.first.lab.cap$z,digits=5)

#Using BIC-ICL-Entropy procedure
#connecting to Matlab
isOpen <- open(matlab)
print(matlab) #check that Matlab connection is open
setVariable(matlab, exp.data.obs = pca.dim)
setVariable(matlab, exp.res.BIC.m = t(hg.pr.first.lab.cap$parameters$mean))
setVariable(matlab, exp.res.BIC.S = hg.pr.first.lab.cap$parameters$variance$sigma)
setVariable(matlab, exp.res.BIC.p = t(hg.pr.first.lab.cap$parameters$pro))
setVariable(matlab, exp.res.BIC.K = t(hg.pr.first.lab.cap$G))
close(matlab)

# MATLAB code
#% cd 'C:\Codes' %Directory where Codes was placed.
#% MixCombi
#% PlotResults
#% PlotEntropy


# Figure 5 - MBC with the first two principle components
pdf("Figure5.pdf", height = 5, width = 14, onefile = T,paper = "special") 
par(mfrow=c(1,3),mar=c(4.6, 4.1, 3.6, 0.6))
mclust2Dplot(pca.dim,parameters=hg.pr.first.lab.cap$parameters, z=hg.pr.first.lab.cap$z,
             what="classification", identify=F, truth=hg.rep.nomiss$hg.voc, colors="white",
             xlim=c(-1.5,3.3), ylim=c(-2.2,2.5))
text(pca.dim[,1], pca.dim[,2],rownames(hg.rep.nomiss),col=hg.rep.nomiss$hg.col,font=2)
title(main="Classification in two variables")
surfacePlotG(pca.dim, ,parameters=hg.pr.first.lab.cap$parameters, type = "contour", what = "density",
             transformation = "none", nlevels=4)
text(pca.dim[,1], pca.dim[,2],rownames(hg.rep.nomiss),col=hg.rep.nomiss$hg.col,font=2)
title(main="Classification in two variables")
plot.mclustBIC(hg.pr.first.lab.cap$BIC, legendArgs = list(x="bottomleft",ncol=4,cex=.75))
title(main="BIC")
abline(h=hg.pr.first.lab.cap$bic-10, lwd=2, lty=2)
dev.off()

# for discussion in manuscript:
# use variable selection seperately for each dim
hg.cap.varsel<-clustvarsel(hg.cap.pr$x,G=8) 
hg.lab.varsel<-clustvarsel(hg.lab.pr$x,G=8) 
hg.pc.sel.labcap<-cbind(hg.cap.varsel$sel.var, hg.lab.varsel$sel.var) 
colnames(hg.pc.sel.labcap)<-c("Corp. gov. PC1","Corp. gov. PC3","Lab. rel. PC1", "Lab. rel. PC3")


# perform MBC with prior
hg.pr.sel.lab.cap<-Mclust(hg.pc.sel.labcap, G=1:9, warn=T,prior=priorControl()) 
# display results
round(hg.pr.sel.lab.cap$z,digits=5)
# for discussion in manuscript:
# compare both cluster solulations to Hall and Gingerich
adjustedRandIndex(hg.pr.first.lab.cap$classification,hg.rep.nomiss$hg.voc) 
adjustedRandIndex(hg.pr.sel.lab.cap$classification,hg.rep.nomiss$hg.voc)

##END
